dataset <- read.csv("C:\\Kate\\Research\\Property\\Data\\property_water_claims_non_cat_fs_v5.csv", header=TRUE)

Claim Partial Dependency XGB Classification dataset exported from a Python notebook

pd_gamma_dataset <- read.csv("C:\\Kate\\Research\\Property\\Data\\EDA_Severity_Gamma_PartialDependency.csv", header=TRUE)
pd_normal_dataset <- read.csv("C:\\Kate\\Research\\Property\\Data\\EDA _Severity_Normal_PartialDependency.csv", header=TRUE)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(funModeling)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## funModeling v.1.6.5 :)
## Examples and tutorials at livebook.datascienceheroes.com
library(ggplot2)
colnames(dataset)
##   [1] "modeldata_id"                       
##   [2] "systemidstart"                      
##   [3] "systemidend"                        
##   [4] "cal_year"                           
##   [5] "startdate"                          
##   [6] "enddate"                            
##   [7] "startdatetm"                        
##   [8] "enddatetm"                          
##   [9] "ecy"                                
##  [10] "log_ecy"                            
##  [11] "policynumber"                       
##  [12] "policy_uniqueid"                    
##  [13] "policyterm"                         
##  [14] "policytype"                         
##  [15] "effectivedate"                      
##  [16] "expirationdate"                     
##  [17] "policystate"                        
##  [18] "policyform"                         
##  [19] "persistency"                        
##  [20] "companycd"                          
##  [21] "carriercd"                          
##  [22] "agency_group"                       
##  [23] "producername"                       
##  [24] "territory"                          
##  [25] "risknumber"                         
##  [26] "risktype"                           
##  [27] "yearbuilt"                          
##  [28] "log_yearbuilt"                      
##  [29] "sqft"                               
##  [30] "log_sqft"                           
##  [31] "stories"                            
##  [32] "roofcd"                             
##  [33] "roofcd_encd"                        
##  [34] "units"                              
##  [35] "occupancycd"                        
##  [36] "occupancy_encd"                     
##  [37] "allperilded"                        
##  [38] "waterded"                           
##  [39] "protectionclass"                    
##  [40] "constructioncd"                     
##  [41] "constructioncd_encd"                
##  [42] "fire_risk_model_score"              
##  [43] "multipolicyind"                     
##  [44] "multipolicyindumbrella"             
##  [45] "earthquakeumbrellaind"              
##  [46] "usagetype"                          
##  [47] "usagetype_encd"                     
##  [48] "ordinanceorlawpct"                  
##  [49] "functionalreplacementcost"          
##  [50] "homegardcreditind"                  
##  [51] "sprinklersystem"                    
##  [52] "landlordind"                        
##  [53] "rentersinsurance"                   
##  [54] "firealarmtype"                      
##  [55] "burglaryalarmtype"                  
##  [56] "waterdetectiondevice"               
##  [57] "neighborhoodcrimewatchind"          
##  [58] "propertymanager"                    
##  [59] "safeguardplusind"                   
##  [60] "kitchenfireextinguisherind"         
##  [61] "gatedcommunityind"                  
##  [62] "deadboltind"                        
##  [63] "poolind"                            
##  [64] "replacementcostdwellingind"         
##  [65] "replacementvalueind"                
##  [66] "serviceline"                        
##  [67] "equipmentbreakdown"                 
##  [68] "numberoffamilies"                   
##  [69] "insuredage"                         
##  [70] "maritalstatus"                      
##  [71] "insurancescore"                     
##  [72] "overriddeninsurancescore"           
##  [73] "insurancescorevalue"                
##  [74] "insscoretiervalueband"              
##  [75] "financialstabilitytier"             
##  [76] "allcov_wp"                          
##  [77] "cova_wp"                            
##  [78] "cova_ep"                            
##  [79] "cova_deductible"                    
##  [80] "log_cova_deductible"                
##  [81] "cova_limit"                         
##  [82] "log_cova_limit"                     
##  [83] "cova_ic_nc_water"                   
##  [84] "hasclaim"                           
##  [85] "cova_il_nc_water"                   
##  [86] "log_cova_il_nc_water"               
##  [87] "water_risk_3_blk"                   
##  [88] "log_water_risk_3_blk"               
##  [89] "water_risk_fre_3_blk"               
##  [90] "log_water_risk_fre_3_blk"           
##  [91] "water_risk_sev_3_blk"               
##  [92] "log_water_risk_sev_3_blk"           
##  [93] "appl_fail_3_blk"                    
##  [94] "fixture_leak_3_blk"                 
##  [95] "pipe_froze_3_blk"                   
##  [96] "plumb_leak_3_blk"                   
##  [97] "rep_cost_3_blk"                     
##  [98] "ustructure_fail_3_blk"              
##  [99] "waterh_fail_3_blk"                  
## [100] "loaddate"                           
## [101] "customer_cnt_active_policies"       
## [102] "customer_cnt_active_policies_binned"
dataset <- dataset[dataset$cova_il_nc_water>0,]

Quick Overview

str(dataset)
## 'data.frame':    14141 obs. of  102 variables:
##  $ modeldata_id                       : int  369477 1770375 1102527 307428 222322 982521 208890 822803 1171971 368993 ...
##  $ systemidstart                      : int  786325 7261637 3342569 555817 365732 2718577 354808 2184406 3877974 785617 ...
##  $ systemidend                        : int  786325 7261637 3342569 555817 365732 2718577 354808 2184406 3877974 785617 ...
##  $ cal_year                           : int  2011 2019 2016 2011 2011 2016 2010 2014 2017 2012 ...
##  $ startdate                          : Factor w/ 4135 levels "2009-01-01","2009-01-02",..: 731 3653 2557 912 731 2557 397 1895 2923 1096 ...
##  $ enddate                            : Factor w/ 4104 levels "2009-01-09 00:00:00",..: 733 3741 2856 1063 923 2619 698 2159 2957 1403 ...
##  $ startdatetm                        : Factor w/ 22990 levels "2008-01-09 00:00:00",..: 1998 19948 13788 4786 2981 12582 1979 10633 14314 5871 ...
##  $ enddatetm                          : Factor w/ 16232 levels "2009-01-09 00:00:00",..: 2179 13966 9995 4196 2908 9054 2164 7732 10430 4810 ...
##  $ ecy                                : num  0.0958 0.3312 0.9089 0.5037 0.616 ...
##  $ log_ecy                            : num  -2.3455 -1.105 -0.0955 -0.6858 -0.4845 ...
##  $ policynumber                       : Factor w/ 243197 levels "AZF0082147","AZF0221975",..: 1093 2338 2387 23585 23503 23763 22523 1161 29650 1628 ...
##  $ policy_uniqueid                    : int  365451 1523478 968090 290101 193025 869673 185822 717770 1005539 365024 ...
##  $ policyterm                         : int  1 8 5 3 2 7 2 5 8 2 ...
##  $ policytype                         : Factor w/ 2 levels "New","Renewal": 1 2 2 2 2 2 2 2 2 2 ...
##  $ effectivedate                      : Factor w/ 4178 levels "2008-01-09","2008-01-14",..: 444 3621 2566 955 634 2329 440 1938 2668 1113 ...
##  $ expirationdate                     : Factor w/ 4299 levels "2009-01-09","2009-01-14",..: 565 3742 2688 1077 755 2451 561 2059 2789 1235 ...
##  $ policystate                        : Factor w/ 3 levels "AZ","CA","NV": 1 1 1 2 2 2 2 1 2 1 ...
##  $ policyform                         : Factor w/ 9 levels "DF1","DF3","DF6",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ persistency                        : int  0 7 4 15 14 18 13 4 7 1 ...
##  $ companycd                          : int  1 1 1 1 1 1 1 1 17 1 ...
##  $ carriercd                          : Factor w/ 2 levels "CSEICO","CSESG": 1 1 1 1 1 1 1 1 2 1 ...
##  $ agency_group                       : Factor w/ 605 levels "","1ST CENTURY INS SVCS INC.",..: 260 459 39 515 216 455 158 533 568 90 ...
##  $ producername                       : Factor w/ 1267 levels "1ST CENTURY INS SVCS INC.",..: 443 999 313 1098 122 993 766 1143 1201 142 ...
##  $ territory                          : Factor w/ 10 levels "","AZ-A","AZ-T",..: 2 3 2 7 6 7 5 2 6 2 ...
##  $ risknumber                         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ risktype                           : Factor w/ 2 levels "Dwelling","Homeowners": 1 1 1 1 1 1 1 1 1 1 ...
##  $ yearbuilt                          : int  1991 1993 1993 1953 1966 1935 1945 1984 1930 1985 ...
##  $ log_yearbuilt                      : num  7.6 7.6 7.6 7.58 7.58 ...
##  $ sqft                               : int  2500 1600 1400 3100 1500 900 1200 3400 2200 1100 ...
##  $ log_sqft                           : num  7.84 7.38 7.24 8.04 7.37 ...
##  $ stories                            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ roofcd                             : Factor w/ 7 levels "COMPO","MEMBRANE",..: 6 6 6 1 1 1 1 1 1 6 ...
##  $ roofcd_encd                        : int  7 7 7 8 8 8 8 8 8 7 ...
##  $ units                              : int  1 1 1 1 1 1 1 4 3 1 ...
##  $ occupancycd                        : Factor w/ 3 levels "NO","OCCUPIEDNOW",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ occupancy_encd                     : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ allperilded                        : int  1000 1000 500 250 500 250 250 1000 1000 1000 ...
##  $ waterded                           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ protectionclass                    : int  3 4 6 6 2 4 3 3 2 2 ...
##  $ constructioncd                     : Factor w/ 5 levels "AF","B","F","M",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ constructioncd_encd                : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ fire_risk_model_score              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ multipolicyind                     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ multipolicyindumbrella             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ earthquakeumbrellaind              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ usagetype                          : Factor w/ 7 levels "COC","PRIMARY",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ usagetype_encd                     : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ ordinanceorlawpct                  : int  10 10 10 10 10 10 10 10 10 10 ...
##  $ functionalreplacementcost          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ homegardcreditind                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ sprinklersystem                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ landlordind                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ rentersinsurance                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ firealarmtype                      : int  0 1 0 0 0 0 0 0 1 0 ...
##  $ burglaryalarmtype                  : int  0 1 0 0 0 0 0 0 1 0 ...
##  $ waterdetectiondevice               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ neighborhoodcrimewatchind          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ propertymanager                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ safeguardplusind                   : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ kitchenfireextinguisherind         : int  1 1 0 0 0 0 0 1 0 1 ...
##  $ gatedcommunityind                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ deadboltind                        : int  1 1 1 0 0 0 0 1 0 1 ...
##  $ poolind                            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ replacementcostdwellingind         : int  1 1 1 0 0 0 1 0 1 0 ...
##  $ replacementvalueind                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ serviceline                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ equipmentbreakdown                 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ numberoffamilies                   : int  1 1 1 1 1 1 1 4 3 1 ...
##  $ insuredage                         : int  58 60 57 NA NA NA NA 37 NA 36 ...
##  $ maritalstatus                      : Factor w/ 5 levels "~","Divorced",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ insurancescore                     : Factor w/ 3836 levels "(DOES","~","610",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ overriddeninsurancescore           : Factor w/ 41 levels "~","01","02",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ insurancescorevalue                : Factor w/ 92 levels "~","630","645",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ insscoretiervalueband              : Factor w/ 22 levels "~","624-632",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ financialstabilitytier             : Factor w/ 42 levels "","~","01","02",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ allcov_wp                          : num  638 465 502 2290 630 ...
##  $ cova_wp                            : num  565 323 448 2290 573 664 419 811 881 309 ...
##  $ cova_ep                            : num  54.1 107 407.2 1153.5 353 ...
##  $ cova_deductible                    : int  1000 1000 500 250 500 250 250 1000 1000 1000 ...
##  $ log_cova_deductible                : num  6.91 6.91 6.21 5.52 6.21 ...
##  $ cova_limit                         : int  300000 200000 200000 900000 300000 300000 200000 400000 400000 200000 ...
##  $ log_cova_limit                     : num  12.5 12.2 12.1 13.7 12.5 ...
##  $ cova_ic_nc_water                   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ hasclaim                           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ cova_il_nc_water                   : num  6298 5457 14918 1140 1863 ...
##  $ log_cova_il_nc_water               : num  8.75 8.6 9.61 7.04 7.53 ...
##  $ water_risk_3_blk                   : int  270 238 164 76 121 104 205 162 145 242 ...
##  $ log_water_risk_3_blk               : num  5.6 5.47 5.1 4.33 4.8 ...
##  $ water_risk_fre_3_blk               : int  241 229 184 51 116 77 180 236 135 197 ...
##  $ log_water_risk_fre_3_blk           : num  5.48 5.43 5.21 3.93 4.75 ...
##  $ water_risk_sev_3_blk               : int  115 107 92 153 107 138 117 71 111 126 ...
##  $ log_water_risk_sev_3_blk           : num  4.74 4.67 4.52 5.03 4.67 ...
##  $ appl_fail_3_blk                    : int  5 5 5 3 5 5 1 5 5 5 ...
##  $ fixture_leak_3_blk                 : int  1 4 4 2 2 2 1 1 3 1 ...
##  $ pipe_froze_3_blk                   : int  0 0 2 2 2 2 2 0 2 0 ...
##  $ plumb_leak_3_blk                   : int  5 5 4 1 1 3 4 1 1 5 ...
##  $ rep_cost_3_blk                     : int  4 4 1 1 5 5 5 4 5 5 ...
##  $ ustructure_fail_3_blk              : int  5 5 5 4 5 5 5 5 5 5 ...
##  $ waterh_fail_3_blk                  : int  1 4 4 0 0 0 0 0 0 1 ...
##   [list output truncated]
summary(dataset)
##   modeldata_id     systemidstart      systemidend         cal_year   
##  Min.   :    179   Min.   :    745   Min.   :    745   Min.   :2009  
##  1st Qu.: 536203   1st Qu.:1225297   1st Qu.:1356851   1st Qu.:2013  
##  Median :1007917   Median :2844382   Median :2943304   Median :2015  
##  Mean   :1009292   Mean   :3440017   Mean   :3579325   Mean   :2015  
##  3rd Qu.:1500796   3rd Qu.:5514105   3rd Qu.:5754124   3rd Qu.:2018  
##  Max.   :2102605   Max.   :9136529   Max.   :9610884   Max.   :2020  
##                                                                      
##       startdate                   enddate                  startdatetm   
##  2019-01-01: 986   2019-01-01 00:00:00: 915   2016-10-01 00:00:00:   16  
##  2018-01-01: 854   2017-01-01 00:00:00: 747   2018-05-01 00:00:00:   16  
##  2017-01-01: 705   2018-01-01 00:00:00: 747   2018-08-17 00:00:00:   15  
##  2016-01-01: 694   2020-01-01 00:00:00: 658   2018-02-28 00:00:00:   14  
##  2013-01-01: 642   2015-01-01 00:00:00: 652   2016-10-15 00:00:00:   13  
##  2014-01-01: 632   2014-01-01 00:00:00: 624   2018-09-29 00:00:00:   13  
##  (Other)   :9628   (Other)            :9798   (Other)            :14054  
##                enddatetm          ecy            log_ecy         
##  2019-05-01 00:00:00:   19   Min.   :0.0054   Min.   :-5.221356  
##  2019-08-17 00:00:00:   18   1st Qu.:0.4380   1st Qu.:-0.825536  
##  2017-10-01 00:00:00:   17   Median :0.6516   Median :-0.428324  
##  2014-05-01 00:00:00:   16   Mean   :0.6239   Mean   :-0.581713  
##  2019-07-01 00:00:00:   16   3rd Qu.:0.8295   3rd Qu.:-0.186932  
##  2014-10-19 00:00:00:   15   Max.   :1.0020   Max.   : 0.001998  
##  (Other)            :14040                                       
##      policynumber   policy_uniqueid     policyterm       policytype   
##  CAH6012203:   12   Min.   :    114   Min.   : 1.000   New    : 2906  
##  NVH0643400:    9   1st Qu.: 483369   1st Qu.: 2.000   Renewal:11235  
##  CAH0751641:    8   Median : 887544   Median : 3.000                  
##  CAH0752785:    8   Mean   : 877829   Mean   : 4.066                  
##  CAH0755516:    8   3rd Qu.:1295306   3rd Qu.: 6.000                  
##  CAH0755561:    8   Max.   :1791878   Max.   :12.000                  
##  (Other)   :14088                                                     
##     effectivedate      expirationdate  policystate       policyform  
##  2018-08-17:   20   2019-08-17:   20   AZ: 1210    HO3        :9617  
##  2018-05-01:   19   2019-05-01:   19   CA:12372    DF3        :3427  
##  2016-10-01:   18   2017-10-01:   18   NV:  559    DF6        : 596  
##  2013-10-19:   17   2014-10-19:   17               Form3      : 408  
##  2016-10-15:   16   2017-10-15:   16               FL3-Special:  68  
##  2017-08-01:   16   2018-08-01:   16               FL2-Broad  :  17  
##  (Other)   :14035   (Other)   :14035               (Other)    :   8  
##   persistency       companycd      carriercd   
##  Min.   : 0.000   Min.   : 1.00   CSEICO:5540  
##  1st Qu.: 1.000   1st Qu.: 1.00   CSESG :8601  
##  Median : 4.000   Median :17.00                
##  Mean   : 6.425   Mean   :11.53                
##  3rd Qu.: 9.000   3rd Qu.:17.00                
##  Max.   :44.000   Max.   :19.00                
##                                                
##                                        agency_group  
##  WESTERN GOLD INS AGCY INC.                  :  949  
##  CRUSBERG DECKER INS SVCS INC                :  621  
##  Acrisure of California                      :  591  
##  J.E. BROWN and ASSOCS INS SVCS              :  560  
##  ISU INSURANCE SERVICES OF  SAN FRANCISCO INC:  545  
##  PIIB - PACIFIC INTERSTATE INS               :  379  
##  (Other)                                     :10496  
##                          producername     territory      risknumber
##  WESTERN GOLD INS AGCY INC.    :  926   CA-B   :4481   Min.   :1   
##  J.E. BROWN and ASSOCS INS SVCS:  537   CA-C   :3697   1st Qu.:1   
##  CRUSBERG DECKER INS SVCS INC  :  397   CA-A   :1839   Median :1   
##  Acrisure of California LLC    :  314   CA-O   :1472   Mean   :1   
##  CALIFORNIA INS SPECIALISTS    :  223   AZ-A   :1053   3rd Qu.:1   
##  HUB INTERNATIONAL INS SVCS    :  218   CA-T   : 965   Max.   :2   
##  (Other)                       :11526   (Other): 634               
##        risktype       yearbuilt    log_yearbuilt        sqft     
##  Dwelling  : 4116   Min.   :1900   Min.   :7.533   Min.   : 800  
##  Homeowners:10025   1st Qu.:1972   1st Qu.:7.587   1st Qu.:1500  
##                     Median :1987   Median :7.594   Median :1900  
##                     Mean   :1982   Mean   :7.592   Mean   :2085  
##                     3rd Qu.:1996   3rd Qu.:7.599   3rd Qu.:2500  
##                     Max.   :2019   Max.   :7.610   Max.   :5000  
##                                                                  
##     log_sqft        stories           roofcd      roofcd_encd   
##  Min.   :6.397   Min.   :1.000   COMPO   :5071   Min.   :1.000  
##  1st Qu.:7.315   1st Qu.:1.000   MEMBRANE: 180   1st Qu.:7.000  
##  Median :7.579   Median :1.000   METAL   :  39   Median :7.000  
##  Mean   :7.586   Mean   :1.235   OTHER   :1943   Mean   :7.062  
##  3rd Qu.:7.857   3rd Qu.:1.000   TAR     : 260   3rd Qu.:8.000  
##  Max.   :9.051   Max.   :3.000   TILE    :6493   Max.   :8.000  
##                                  WOOD    : 155                  
##      units            occupancycd    occupancy_encd   allperilded   
##  Min.   :1.000   NO         :    0   Min.   :1.000   Min.   :  100  
##  1st Qu.:1.000   OCCUPIEDNOW:13479   1st Qu.:1.000   1st Qu.:  500  
##  Median :1.000   TENANT     :  662   Median :1.000   Median : 1000  
##  Mean   :1.083                       Mean   :1.047   Mean   : 1144  
##  3rd Qu.:1.000                       3rd Qu.:1.000   3rd Qu.: 1000  
##  Max.   :4.000                       Max.   :2.000   Max.   :10000  
##                                                                     
##     waterded        protectionclass  constructioncd constructioncd_encd
##  Min.   :    0.00   Min.   : 0.000   AF   :4886     Min.   :1.000      
##  1st Qu.:    0.00   1st Qu.: 2.000   B    : 187     1st Qu.:4.000      
##  Median :    0.00   Median : 3.000   F    :8731     Median :5.000      
##  Mean   :   32.18   Mean   : 3.116   M    : 197     Mean   :4.543      
##  3rd Qu.:    0.00   3rd Qu.: 4.000   OTHER: 140     3rd Qu.:5.000      
##  Max.   :10000.00   Max.   :10.000                  Max.   :5.000      
##                                                                        
##  fire_risk_model_score multipolicyind   multipolicyindumbrella
##  Min.   :-1.0000       Min.   :0.0000   Min.   :0.000000      
##  1st Qu.: 0.0000       1st Qu.:0.0000   1st Qu.:0.000000      
##  Median : 0.0000       Median :0.0000   Median :0.000000      
##  Mean   : 0.3155       Mean   :0.1882   Mean   :0.004172      
##  3rd Qu.: 0.0000       3rd Qu.:0.0000   3rd Qu.:0.000000      
##  Max.   :12.0000       Max.   :1.0000   Max.   :1.000000      
##                                                               
##  earthquakeumbrellaind      usagetype     usagetype_encd ordinanceorlawpct
##  Min.   :0.000000      COC       :    4   Min.   :2.00   Min.   :  0.000  
##  1st Qu.:0.000000      PRIMARY   :10222   1st Qu.:6.00   1st Qu.:  0.000  
##  Median :0.000000      RENTAL    : 3859   Median :7.00   Median : 10.000  
##  Mean   :0.002899      SEASONAL  :   46   Mean   :6.71   Mean   :  8.929  
##  3rd Qu.:0.000000      SECONDARY :    8   3rd Qu.:7.00   3rd Qu.: 10.000  
##  Max.   :1.000000      UNOCCUPIED:    0   Max.   :7.00   Max.   :100.000  
##                        VACANT    :    2                                   
##  functionalreplacementcost homegardcreditind sprinklersystem  
##  Min.   :0.0000000         Min.   :0.0000    Min.   :0.00000  
##  1st Qu.:0.0000000         1st Qu.:0.0000    1st Qu.:0.00000  
##  Median :0.0000000         Median :0.0000    Median :0.00000  
##  Mean   :0.0007779         Mean   :0.2037    Mean   :0.03027  
##  3rd Qu.:0.0000000         3rd Qu.:0.0000    3rd Qu.:0.00000  
##  Max.   :1.0000000         Max.   :1.0000    Max.   :1.00000  
##                                                               
##   landlordind      rentersinsurance  firealarmtype    burglaryalarmtype
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000   
##  Median :0.00000   Median :0.00000   Median :1.0000   Median :0.0000   
##  Mean   :0.03097   Mean   :0.00396   Mean   :0.6185   Mean   :0.3202   
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:1.0000   
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   
##                                                                        
##  waterdetectiondevice neighborhoodcrimewatchind propertymanager  
##  Min.   :0.0000000    Min.   :0.0000            Min.   :0.00000  
##  1st Qu.:0.0000000    1st Qu.:0.0000            1st Qu.:0.00000  
##  Median :0.0000000    Median :0.0000            Median :0.00000  
##  Mean   :0.0002121    Mean   :0.0203            Mean   :0.01047  
##  3rd Qu.:0.0000000    3rd Qu.:0.0000            3rd Qu.:0.00000  
##  Max.   :1.0000000    Max.   :1.0000            Max.   :1.00000  
##                                                                  
##  safeguardplusind kitchenfireextinguisherind gatedcommunityind
##  Min.   :0.0000   Min.   :0.0000             Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.0000             1st Qu.:0.00000  
##  Median :0.0000   Median :0.0000             Median :0.00000  
##  Mean   :0.3677   Mean   :0.4544             Mean   :0.01824  
##  3rd Qu.:1.0000   3rd Qu.:1.0000             3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.0000             Max.   :1.00000  
##                                                               
##   deadboltind        poolind        replacementcostdwellingind
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.0000            
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000            
##  Median :1.0000   Median :0.00000   Median :0.0000            
##  Mean   :0.7038   Mean   :0.05785   Mean   :0.2305            
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:0.0000            
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.0000            
##                                                               
##  replacementvalueind  serviceline     equipmentbreakdown numberoffamilies
##  Min.   :0.00000     Min.   :0.0000   Min.   :0.0000     Min.   :1.000   
##  1st Qu.:0.00000     1st Qu.:0.0000   1st Qu.:0.0000     1st Qu.:1.000   
##  Median :0.00000     Median :0.0000   Median :0.0000     Median :1.000   
##  Mean   :0.02447     Mean   :0.1227   Mean   :0.1326     Mean   :1.083   
##  3rd Qu.:0.00000     3rd Qu.:0.0000   3rd Qu.:0.0000     3rd Qu.:1.000   
##  Max.   :1.00000     Max.   :1.0000   Max.   :1.0000     Max.   :4.000   
##                                                                          
##    insuredage      maritalstatus  insurancescore  overriddeninsurancescore
##  Min.   :  0.00   ~       :6462   ~      :13373   ~      :12898           
##  1st Qu.: 45.00   Divorced:  84   99     :    6   99     :  428           
##  Median : 55.00   Married :3954   ZQWFV  :    5   07     :   86           
##  Mean   : 55.09   Single  :3484   DWKDS  :    4   12     :   66           
##  3rd Qu.: 65.00   Widowed : 157   FWKVS  :    4   13     :   58           
##  Max.   :114.00                   USXQD  :    4   04     :   54           
##  NA's   :1273                     (Other):  745   (Other):  551           
##  insurancescorevalue insscoretiervalueband financialstabilitytier
##  ~      :14137       ~      :14137                :13376         
##  925    :    2       714-731:    2         ~      :  684         
##  729    :    1       894-945:    2         14     :    9         
##  731    :    1       624-632:    0         07     :    8         
##  630    :    0       640-648:    0         13     :    7         
##  645    :    0       656-664:    0         06     :    6         
##  (Other):    0       (Other):    0         (Other):   51         
##    allcov_wp       cova_wp        cova_ep         cova_deductible
##  Min.   :  48   Min.   :   9   Min.   :   1.428   Min.   :  100  
##  1st Qu.: 656   1st Qu.: 635   1st Qu.: 318.021   1st Qu.:  500  
##  Median : 907   Median : 959   Median : 565.492   Median : 1000  
##  Mean   :1021   Mean   :1164   Mean   : 729.635   Mean   : 1144  
##  3rd Qu.:1230   3rd Qu.:1492   3rd Qu.: 952.465   3rd Qu.: 1000  
##  Max.   :7588   Max.   :9468   Max.   :8631.976   Max.   :10000  
##                                                                  
##  log_cova_deductible   cova_limit      log_cova_limit   cova_ic_nc_water
##  Min.   :4.605       Min.   : 100000   Min.   : 8.372   Min.   :1.000   
##  1st Qu.:6.215       1st Qu.: 300000   1st Qu.:12.488   1st Qu.:1.000   
##  Median :6.908       Median : 400000   Median :12.804   Median :1.000   
##  Mean   :6.862       Mean   : 451291   Mean   :12.769   Mean   :1.021   
##  3rd Qu.:6.908       3rd Qu.: 500000   3rd Qu.:13.117   3rd Qu.:1.000   
##  Max.   :9.210       Max.   :1300000   Max.   :14.644   Max.   :3.000   
##                                                                         
##     hasclaim cova_il_nc_water   log_cova_il_nc_water water_risk_3_blk
##  Min.   :1   Min.   :     0.9   Min.   :-0.1054      Min.   :  23.0  
##  1st Qu.:1   1st Qu.:  2953.0   1st Qu.: 7.9906      1st Qu.: 158.0  
##  Median :1   Median :  7088.9   Median : 8.8663      Median : 208.0  
##  Mean   :1   Mean   : 13549.9   Mean   : 8.7756      Mean   : 226.7  
##  3rd Qu.:1   3rd Qu.: 15966.8   3rd Qu.: 9.6783      3rd Qu.: 270.0  
##  Max.   :1   Max.   :522735.2   Max.   :13.1668      Max.   :1056.0  
##                                                                      
##  log_water_risk_3_blk water_risk_fre_3_blk log_water_risk_fre_3_blk
##  Min.   :3.135        Min.   :  21.0       Min.   :3.045           
##  1st Qu.:5.063        1st Qu.: 124.0       1st Qu.:4.820           
##  Median :5.338        Median : 174.0       Median :5.159           
##  Mean   :5.328        Mean   : 184.5       Mean   :5.115           
##  3rd Qu.:5.598        3rd Qu.: 221.0       3rd Qu.:5.398           
##  Max.   :6.962        Max.   :1044.0       Max.   :6.951           
##                                                                    
##  water_risk_sev_3_blk log_water_risk_sev_3_blk appl_fail_3_blk
##  Min.   : 35.0        Min.   :3.555            Min.   :0.000  
##  1st Qu.:109.0        1st Qu.:4.691            1st Qu.:4.000  
##  Median :129.0        Median :4.860            Median :5.000  
##  Mean   :130.6        Mean   :4.845            Mean   :4.257  
##  3rd Qu.:150.0        3rd Qu.:5.011            3rd Qu.:5.000  
##  Max.   :265.0        Max.   :5.580            Max.   :5.000  
##                                                               
##  fixture_leak_3_blk pipe_froze_3_blk plumb_leak_3_blk rep_cost_3_blk 
##  Min.   :0.000      Min.   :0.000    Min.   :0.000    Min.   :0.000  
##  1st Qu.:1.000      1st Qu.:1.000    1st Qu.:1.000    1st Qu.:5.000  
##  Median :2.000      Median :2.000    Median :4.000    Median :5.000  
##  Mean   :1.894      Mean   :1.588    Mean   :3.363    Mean   :4.825  
##  3rd Qu.:3.000      3rd Qu.:2.000    3rd Qu.:5.000    3rd Qu.:5.000  
##  Max.   :5.000      Max.   :5.000    Max.   :5.000    Max.   :5.000  
##                                                                      
##  ustructure_fail_3_blk waterh_fail_3_blk                   loaddate    
##  Min.   :0.000         Min.   :0.000     2020-07-29 05:01:19.56:14141  
##  1st Qu.:4.000         1st Qu.:0.000                                   
##  Median :5.000         Median :2.000                                   
##  Mean   :4.313         Mean   :1.333                                   
##  3rd Qu.:5.000         3rd Qu.:2.000                                   
##  Max.   :5.000         Max.   :5.000                                   
##                                                                        
##  customer_cnt_active_policies customer_cnt_active_policies_binned
##  Min.   : 1.000               Min.   : 1.000                     
##  1st Qu.: 1.000               1st Qu.: 1.000                     
##  Median : 1.000               Median : 1.000                     
##  Mean   : 1.222               Mean   : 1.767                     
##  3rd Qu.: 1.000               3rd Qu.: 1.000                     
##  Max.   :28.000               Max.   :30.000                     
## 

Visualization

Severity

ggplot(dataset, aes(x = .data[['cova_il_nc_water']], color="red",fill="#56B4E9")) +
  geom_histogram(bins=100) +
  labs(x = 'cova_il_nc_water', y = 'Count', title = "Histogram of Losses (cova_il_nc_water)")+
  scale_color_manual(values=c("red")) +  
  scale_fill_manual(values=c("#56B4E9")) +  
  theme(legend.position = "none")

ggplot(dataset, aes(x = .data[['cova_il_nc_water']])) +
  geom_density() + 
  labs(x = 'cova_il_nc_water', y = 'Density', title = "Density of Losses (cova_il_nc_water)")

  vec <- dataset$cova_il_nc_water
  y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
  x <- qnorm(c(0.25, 0.75))
  slope <- diff(y)/diff(x)
  int <- y[1L] - slope * x[1L]

ggplot(dataset, aes(sample = .data[['cova_il_nc_water']], col='red')) + 
  stat_qq() + 
  geom_abline(slope = slope, intercept = int) +
  theme(legend.position = "none") +
  labs(y = 'cova_il_nc_water', title = "QQ Plot of Losses (cova_il_nc_water)")

ggplot(dataset, aes(x = .data[['log_cova_il_nc_water']], color="red",fill="#56B4E9")) +
  geom_histogram(bins=100) +
  scale_color_manual(values=c("red")) +  
  scale_fill_manual(values=c("#56B4E9")) + 
  labs(x = 'log(cova_il_nc_water)', y = 'Count', title = "Histogram of Losses (log_cova_il_nc_water)")+
  theme(legend.position = "none")

ggplot(dataset, aes(x = .data[['log_cova_il_nc_water']])) +
  geom_density() + 
  labs(x = 'log(cova_il_nc_water)', y = 'Density', title = 'Density of Losses (log_cova_il_nc_water)')

  vec <- dataset$log_cova_il_nc_water
  y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
  x <- qnorm(c(0.25, 0.75))
  slope <- diff(y)/diff(x)
  int <- y[1L] - slope * x[1L]

ggplot(dataset, aes(sample = .data[['log_cova_il_nc_water']], col='red')) + 
  stat_qq() + 
  geom_abline(slope = slope, intercept = int) +
  theme(legend.position = "none") +
  labs(y = 'log(cova_il_nc_water)', title = 'QQ Plot of Losses (log_cova_il_nc_water)')

Looks like gamma distribution and log conversion is more close to normal.

Box Plots of categorical predictors and Losses

plot_categorical <- function(col_name) {
#dataset[[col_name]] <- as.factor(dataset[[col_name]])
#ggplot(dataset, aes(x=.data[[col_name]], y=cova_il_nc_water, col=.data[[col_name]], fill=.data[[col_name]])) +
#  geom_boxplot(notch=notch) +
#  theme(legend.position = "none") +
#  labs(x = col_name, title = paste("Box Plot of Losses and ", col_name))
if ( col_name == 'roofcd_encd') {
col_name_original <- 'roofcd'
} 
  else if ( col_name == 'usagetype_encd' ) {
col_name_original <- 'usagetype'
} 
else if ( col_name == 'constructioncd_encd' ) {
col_name_original <- 'constructioncd'
} 
  else if ( col_name == 'occupancy_encd' ) {
col_name_original <- 'occupancycd'
} 
  else {
  col_name_original <- col_name
}
  
if (grepl('encd',col_name) | 
    col_name =='fire_risk_model_score' | 
    col_name =='customer_cnt_active_policies_binned' |
    col_name =='cova_deductible' | 
    col_name =='cova_limit' | 
    col_name =='protectionclass' | 
    col_name =='ordinanceorlawpct' | 
    col_name =='numberoffamilies' | 
    col_name =='waterded' | 
    col_name =='units' | 
    col_name =='sprinklersystem' |
    col_name =='stories') {
  XBreaks <- as.vector(unlist(unique(dataset[c(col_name)])))
  XLabels <- as.vector(unlist(unique(dataset[c(col_name_original)]))) 
} 
else if (col_name =='pipe_froze_3_blk' | 
col_name =='water_risk_3_blk' | 
col_name =='ustructure_fail_3_blk' | 
col_name =='water_risk_fre_3_blk' | 
col_name =='waterh_fail_3_blk' | 
col_name =='rep_cost_3_blk' | 
col_name =='plumb_leak_3_blk' | 
col_name =='appl_fail_3_blk' | 
col_name =='fixture_leak_3_blk') {
  XBreaks <- c(0,1,2,3,4,5)
  XLabels <- c('Low', 'Elevated', 'Below Avg', 'Average', 'High', 'Highest')    
}
else {
  XBreaks <- c(0,1)
  XLabels <- c("No", "Yes")  

}
  
pd_col <- pd_gamma_dataset[pd_gamma_dataset$feature == col_name,][c('value','pd')]
colnames(pd_col) <- c(col_name,'pd')
plotdata <- dataset %>%
  group_by(.dots = col_name) %>%
  dplyr::summarize(n = n(),
         mean = mean(cova_il_nc_water),
         sd = sd(cova_il_nc_water),
         se = sd / sqrt(n) #,
         #ci = qt(0.975, df = n - 1) * sd / sqrt(n)
         )

plotdata <- merge(plotdata,pd_col,by=col_name)  
cols <- c('Mean Losses with standard error'='#f04546','Losses Partial Dependency'='#3591d1')
p1 = ggplot(plotdata) +
scale_fill_manual(values=cols) +
scale_colour_manual(values=cols) +  
geom_point(aes(x = .data[[col_name]], y = mean,group = 1,color='Mean Losses with standard error'), size = 3) +
geom_line(aes(x = .data[[col_name]], y = mean,group = 1,color='Mean Losses with standard error')) +
geom_errorbar(aes(x = .data[[col_name]], y = mean,group = 1, ymin = mean - se,ymax = mean + se,color='Mean Losses with standard error'),width = .1) +
geom_point(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses Partial Dependency'), size = 3) +  
geom_line(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses Partial Dependency')) +
scale_x_continuous( breaks=XBreaks, labels= XLabels)   +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))


p2 = ggplot(plotdata) +
scale_fill_manual(values=cols) +
scale_colour_manual(values=cols) +    
geom_point(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses Partial Dependency'), size = 3) +  
geom_line(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses Partial Dependency')) +
scale_x_continuous( breaks=XBreaks, labels= XLabels)   +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

print(p1)
print(p2)

#----------------------
pd_col <- pd_normal_dataset[pd_normal_dataset$feature == col_name,][c('value','pd')]
if (nrow(pd_col)>0) {
colnames(pd_col) <- c(col_name,'pd')
plotdata <- dataset %>%
  group_by(.dots = col_name) %>%
  dplyr::summarize(n = n(),
         mean = mean(log_cova_il_nc_water),
         sd = sd(log_cova_il_nc_water),
         se = sd / sqrt(n) #,
         #ci = qt(0.975, df = n - 1) * sd / sqrt(n)
         )

plotdata <- merge(plotdata,pd_col,by=col_name)  

cols <- c('Mean Losses (Log) with standard error'='#f04546','Losses (Log) Partial Dependency'='#3591d1')
p1 = ggplot(plotdata) +
scale_fill_manual(values=cols) +
scale_colour_manual(values=cols) +    
geom_point(aes(x = .data[[col_name]], y = mean,group = 1,color='Mean Losses (Log) with standard error'), size = 3) +
geom_line(aes(x = .data[[col_name]], y = mean,group = 1,color='Mean Losses (Log) with standard error')) +
geom_errorbar(aes(x = .data[[col_name]], y = mean,group = 1, ymin = mean - se,ymax = mean + se,color='Mean Losses (Log) with standard error'),width = .1) +
geom_point(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses (Log) Partial Dependency'), size = 3) +  
geom_line(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses (Log) Partial Dependency')) +
scale_x_continuous( breaks=XBreaks, labels= XLabels)   +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))


p2 = ggplot(plotdata) +
scale_fill_manual(values=cols) +
scale_colour_manual(values=cols) +    
geom_point(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses (Log) Partial Dependency'), size = 3) +  
geom_line(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses (Log) Partial Dependency')) +
scale_x_continuous( breaks=XBreaks, labels= XLabels)   +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

print(p1)
print(p2)
}
}
plot_categorical('functionalreplacementcost')

plot_categorical('numberoffamilies')

plot_categorical('units')

plot_categorical('stories')

plot_categorical('cova_deductible')

plot_categorical('replacementvalueind')

plot_categorical('neighborhoodcrimewatchind')

plot_categorical('fire_risk_model_score')

ggplot(data=dataset, mapping = aes(x=water_risk_sev_3_blk, y=cova_il_nc_water, color=-cova_il_nc_water)) +
  geom_point() +
  labs(title = "Losses vs water_risk_sev_3_blk")

The higest losses are between 100 and 200 water_risk_sev_3_blk. We may have not enough data for higher numbers of water_risk_sev_3_blk

plot_categorical('burglaryalarmtype')

plot_categorical('deadboltind')

plot_categorical('waterh_fail_3_blk')

plot_categorical('safeguardplusind')

plot_categorical('equipmentbreakdown')

plot_categorical('cova_limit')

ggplot(data=dataset, mapping = aes(x=water_risk_3_blk, y=cova_il_nc_water, color=-cova_il_nc_water)) +
  geom_point() +
  labs(title = "Losses vs water_risk_3_blk")

There are more higher losses around 250 water_risk_3_blk

plot_categorical('usagetype_encd')

plot_categorical('multipolicyindumbrella')

plot_categorical('constructioncd_encd')

plot_categorical('replacementcostdwellingind')

plot_categorical('roofcd_encd')

plot_categorical('gatedcommunityind')

plot_categorical('homegardcreditind')

plot_categorical('ustructure_fail_3_blk')

ggplot(data=dataset, mapping = aes(x=water_risk_fre_3_blk, y=cova_il_nc_water, color=-cova_il_nc_water)) +
  geom_point() +
  labs(title = "Losses vs water_risk_fre_3_blk")

plot_categorical('ecy')

ggplot(data=dataset, mapping = aes(x=ecy, y=cova_il_nc_water, color=-cova_il_nc_water)) +
  geom_point() +
  labs(title = "Losses vs ecy")

plot_categorical('appl_fail_3_blk')

plot_categorical('pipe_froze_3_blk')

plot_categorical('firealarmtype')

plot_categorical('fixture_leak_3_blk')

plot_categorical('sqft')

ggplot(data=dataset, mapping = aes(x=sqft, y=cova_il_nc_water, color=-cova_il_nc_water)) +
  geom_point() +
  labs(title = "Losses vs sqft")

Severity is higher in larger properties

plot_categorical('yearbuilt')

ggplot(data=dataset, mapping = aes(x=yearbuilt, y=cova_il_nc_water, color=-cova_il_nc_water)) +
  geom_point() +
  labs(title = "Losses vs YearBuilt")

Severity is higher in newer properties

plot_categorical('customer_cnt_active_policies_binned')

ggplot(data=dataset, mapping = aes(x=customer_cnt_active_policies_binned, y=cova_il_nc_water, color=-cova_il_nc_water)) +
  geom_point() +
  labs(title = "Losses vs customer_cnt_active_policies_binned")

plot_categorical('protectionclass')

plot_categorical('multipolicyind')

plot_categorical('rep_cost_3_blk')

plot_categorical('plumb_leak_3_blk')

plot_categorical('rentersinsurance')

plot_categorical('sprinklersystem')

plot_categorical('kitchenfireextinguisherind')

plot_categorical('poolind')

plot_categorical('landlordind')

plot_categorical('occupancy_encd')

plot_categorical('ordinanceorlawpct')

plot_categorical('waterded')

plot_categorical('serviceline')

plot_categorical('propertymanager')

plot_categorical('earthquakeumbrellaind')

Misc

ggplot(data=dataset, mapping = aes(x=yearbuilt, y=cova_il_nc_water, color=roofcd)) +
  geom_point() +
  labs(title = "Losses vs YearBuilt and Roofcd Limit")

ggplot(data=dataset, mapping = aes(x=sqft, y=cova_il_nc_water, color=-cova_limit)) +
  geom_point() +
  labs(title = "Losses vs Sqft and CovA Limit")